In [1]:
%matplotlib inline
import os
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn import cross_validation as cv
from sklearn.cross_validation import train_test_split as tts

from sklearn.linear_model import Ridge
from sklearn.linear_model import RandomizedLasso
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor


from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse


//anaconda/lib/python3.6/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

Load and Merge Datasets


In [2]:
#Load the sensors dataset
sensor = pd.read_csv('sensor_updated.csv')
sensor.info()
sensor.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70989 entries, 0 to 70988
Data columns (total 7 columns):
datetime             70989 non-null object
temperature          3426 non-null float64
humidity             3426 non-null float64
co2                  3426 non-null float64
light                3426 non-null float64
noise                3426 non-null float64
bluetooth_devices    3426 non-null float64
dtypes: float64(6), object(1)
memory usage: 3.8+ MB
Out[2]:
datetime temperature humidity co2 light noise bluetooth_devices
0 2017-03-25 09:05:00 22.60 36.900000 781.000000 430.0 511.000000 1.000000
1 2017-03-25 09:06:00 23.80 38.950000 765.900000 426.9 502.000000 11.400000
2 2017-03-25 09:07:00 23.85 38.910000 768.300000 422.4 510.400000 19.600000
3 2017-03-25 09:08:00 23.90 38.772727 777.454545 424.0 506.909091 29.727273
4 2017-03-25 09:09:00 23.91 38.730000 770.800000 438.1 500.700000 35.900000

In [3]:
#Load the occupancy dataset
occupancy = pd.read_csv('image_variations.csv')
occupancy.info()
occupancy.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3551 entries, 0 to 3550
Data columns (total 4 columns):
datetime         3551 non-null object
control_F_rms    3551 non-null float64
control_L_rms    3551 non-null float64
rolling_rms      3551 non-null float64
dtypes: float64(3), object(1)
memory usage: 111.0+ KB
Out[3]:
datetime control_F_rms control_L_rms rolling_rms
0 2017-03-25 09:11:00 0.000000 68.764028 0.000000
1 2017-03-25 09:12:00 15.242697 69.110523 15.242697
2 2017-03-25 09:13:00 15.526992 69.169608 15.087697
3 2017-03-25 09:14:00 18.106792 69.253149 15.422978
4 2017-03-25 09:15:00 19.040465 69.159929 14.799398

In [4]:
#Merge the two datasets by datetime
df = pd.merge(sensor, occupancy[['datetime','rolling_rms']], on='datetime', how='inner')
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3551 entries, 0 to 3550
Data columns (total 8 columns):
datetime             3551 non-null object
temperature          3379 non-null float64
humidity             3379 non-null float64
co2                  3379 non-null float64
light                3379 non-null float64
noise                3379 non-null float64
bluetooth_devices    3379 non-null float64
rolling_rms          3551 non-null float64
dtypes: float64(7), object(1)
memory usage: 249.7+ KB

In [5]:
#Drop rows with any NaN values
df = df[pd.notnull(df['temperature'])]

In [6]:
#Round the rolling_rms feature to integer
df.rolling_rms = df.rolling_rms.round()

In [7]:
#Drop the datetime feature
df = df.drop('datetime', 1)

In [8]:
np.where(np.isnan(df))


Out[8]:
(array([], dtype=int64), array([], dtype=int64))

In [9]:
pd.scatter_matrix(df, alpha=0.2, figsize=(18,18), diagonal='kde')
plt.show()


Models -- Regression

note to self: Models predicting the number of occupancy, or the category? normalization or not?


In [10]:
df_features = df.ix[:,0:-1]
df_labels = df.ix[:,-1]

In [11]:
splits = cv.train_test_split(df_features, df_labels, test_size=0.2)
X_train, X_test, y_train, y_test = splits

In [12]:
model = Ridge(alpha=0.1)
model.fit(X_train, y_train)

expected = y_test
predicted = model.predict(X_test)

print("Ridge Regression model")
print("Mean Squared Error: %0.3f" % mse(expected, predicted))
print("Coefficient of Determination: %0.3f" % r2_score(expected, predicted))


Ridge Regression model
Mean Squared Error: 27.593
Coefficient of Determination: 0.106

In [13]:
model = RandomForestRegressor()
model.fit(X_train, y_train)

expected = y_test
predicted = model.predict(X_test)

print("Random Forest model")
print("Mean squared error = %0.3f" % mse(expected, predicted))
print("R2 score = %0.3f" % r2_score(expected, predicted))


Random Forest model
Mean squared error = 18.246
R2 score = 0.409

Models -- Classification


In [14]:
import time
from sklearn import metrics
from sklearn import cross_validation
from sklearn.cross_validation import KFold

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [15]:
df.describe()


Out[15]:
temperature humidity co2 light noise bluetooth_devices rolling_rms
count 3379.000000 3379.000000 3379.000000 3379.000000 3379.000000 3379.000000 3379.000000
mean 23.125713 39.380875 1190.216754 470.305878 328.453681 242.665067 12.732169
std 1.546089 7.176134 164.681403 575.067659 176.818161 148.146838 5.569003
min 21.000000 21.190909 659.833333 143.000000 58.000000 0.000000 0.000000
25% 22.200000 37.489444 1074.477273 183.125000 143.916667 126.000000 10.000000
50% 22.866667 39.180000 1229.416667 222.625000 461.750000 207.416667 11.000000
75% 23.191667 45.487500 1300.625000 443.550000 503.900000 352.541667 15.000000
max 29.350000 50.730000 1724.200000 2891.583333 574.000000 634.833333 65.000000

In [16]:
def occupancy(c):
  if c['rolling_rms'] < 10:
    return '1'
  elif c['rolling_rms'] > 20:
    return '3'
  else:
    return '2'

df['occupancy'] = df.apply(occupancy, axis=1)
df = df.drop('rolling_rms', 1)

In [17]:
data   = df.iloc[:, 0:-1]
target = df.iloc[:, -1]

In [18]:
model = SVC()
model.fit(X_train, y_train)

expected = y_test
predicted = model.predict(X_test)
accuracy = metrics.accuracy_score(expected, predicted)

print("SVM Classifier")
print("Mean squared error = %0.3f" % mse(expected, predicted))
print(accuracy)


SVM Classifier
Mean squared error = 38.938
0.137573964497

In [19]:
model = KNeighborsClassifier()
model.fit(X_train, y_train)

expected = y_test
predicted = model.predict(X_test)
accuracy = metrics.accuracy_score(expected, predicted)

print("K Neighbors Classifier")
print("Mean squared error = %0.3f" % mse(expected, predicted))
print(accuracy)


K Neighbors Classifier
Mean squared error = 30.766
0.152366863905

In [20]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

expected = y_test
predicted = model.predict(X_test)
accuracy = metrics.accuracy_score(expected, predicted)

print("Random Forest Classifierr")
print("Mean squared error = %0.3f" % mse(expected, predicted))
print(accuracy)


Random Forest Classifierr
Mean squared error = 25.238
0.181952662722

In [ ]: